Importing needed packages
library(ape)
library(dendextend)
library(cluster)
library(tibble)
library(magrittr)
library(dplyr)
library(phytools)
library(mltools)
library(data.table)
library(factoextra)
source("C:/Users/lucru/Estatística_UFSCar/cv_cluster/modules/convert_to_parenthesis.R")
source("C:/Users/lucru/Estatística_UFSCar/cv_cluster/modules/cv_score.R")
library(tictoc)
library(mvMORPH)
library(tidyr)
library(MASS)
library(ggplot2)
library(ggthemes)
library(ggpubr)
Here we have the interest of comparing the scatterplot, dendrogram and score of each hierarchical clustering method in order to do a proof of consent regarding our score, i.e., verify that our score makes sense in hierarchical clustering context. For that, me simulate the data with \(n = 100\) sample points:
set.seed(500)
n = 100
p = 2
Y = sample(c(0, 1), n, replace = TRUE, prob = c(0.5, 0.5))
mu_0 = c(0, 0)
mu_1 = c(2, 2)
S = diag(nrow = 2, ncol = 2)
X = matrix(nrow = n, ncol = p)
X[Y == 0, ] = round(mvrnorm(sum(Y == 0), mu_0, S), 4)
X[Y == 1, ] = round(mvrnorm(sum(Y == 1), mu_1, S), 4)
sim.data = as.data.frame(cbind(X , Y))
sim.data$Y = as.factor(Y)
colnames(sim.data) = c("X1", "X2", "Y")
row.names(sim.data) = c(1:nrow(sim.data))
head(sim.data, 4)
Simulating the “wrong” data:
X_3 = data.frame(X3 = rexp(n, 1))
head(X_3)
Making the ploting function:
plot_all_scores = function(original.data, cons.data, cl.list, dists = NA){
list.names = names(cl.list)
l = length(list.names)
for(k in 1:l){
hc.func = list.names[k]
methods = cl.list[[hc.func]]
m = length(methods)
if(is.na(dists)){
if(m > 0){
for(c in 1:m){
cons.hc = get(hc.func)(dist(scale(cons.data)), method = methods[c])
cons.dend = convert_to_phylo(cons.hc)
score.cons = L_score(cons.dend, original.data[, -3])
original.hc = get(hc.func)(dist(scale(original.data[, -3])), method = methods[c])
original.dend = convert_to_phylo(original.hc)
score.original = L_score(original.dend, original.data[, -3])
p1.original = fviz_dend(original.hc, k = 2)
factors = factor(cutree(original.hc, k = 2))
temp_data = original.data[, -3]
temp_data$labels = factors
p2.original = temp_data %>%
ggplot(aes(x = X1, y = X2, colour = labels)) +
geom_point() +
labs(x = "X1",
y = "X2",
colour = "Labels") +
scale_colour_brewer(palette = "Set1") +
theme_minimal()
p1.cons = fviz_dend(cons.hc, k = 2)
factors = factor(cutree(original.hc, k = 2))
temp_data = original.data[, -3]
temp_data$labels = factors
p2.cons = temp_data %>%
ggplot(aes(x = X1, y = X2, colour = labels)) +
geom_point() +
labs(x = "X1",
y = "X2",
colour = "Labels") +
scale_colour_brewer(palette = "Set1") +
theme_minimal()
t_grob1 = text_grob(paste0(hc.func,".", methods[c],
" original data score: ", round(score.original, 4)), size = 12)
t_grob2 = text_grob(paste0(hc.func,".", methods[c],
" wrong data score: ", round(score.cons, 4)), size = 12)
plot_1 = as_ggplot(t_grob1) + theme(plot.margin = margin(0,3,0,0, "cm"))
plot_2 = as_ggplot(t_grob2) + theme(plot.margin = margin(0,3,0,0, "cm"))
g1 = ggarrange(plot_1,ggarrange(p1.original, p2.original,
ncol = 2), nrow = 2, heights = c(1,5))
g2 = ggarrange(plot_2, ggarrange(p1.cons, p2.cons,
ncol = 2), nrow = 2, heights = c(1,5))
show(g1)
show(g2)
}
}else{
cons.hc = get(hc.func)(dist(scale(cons.data)))
cons.dend = convert_to_phylo(cons.hc)
score.cons = L_score(cons.dend, original.data[, -3])
original.hc = get(hc.funct)(dist(scale(original.data[, -3])))
original.dend = convert_to_phylo(original.hc)
score.original = L_score(original.dend, original.data[, -3])
p1.original = fviz_dend(original.hc, k = 2)
factors = factor(cutree(original.hc, k = 2))
temp_data = original.data[, -3]
temp_data$labels = factors
p2.original = temp_data %>%
ggplot(aes(x = X1, y = X2, colour = labels)) +
geom_point() +
labs(x = "X1",
y = "X2",
colour = "Labels") +
scale_colour_brewer(palette = "Set1") +
theme_minimal()
p1.cons = fviz_dend(cons.hc, k = 2)
factors = factor(cutree(original.hc, k = 2))
temp_data = original.data[, -3]
temp_data$labels = factors
p2.cons = temp_data %>%
ggplot(aes(x = X1, y = X2, colour = labels)) +
geom_point() +
labs(x = "X1",
y = "X2",
colour = "Labels") +
scale_colour_brewer(palette = "Set1") +
theme_minimal()
t_grob1 = text_grob(paste0(hc.func," original data score: ",
round(score.original, 4)))
t_grob2 = text_grob(paste0(hc.func, " wrong data score: ",
round(score.cons, 4)))
plot_1 = as_ggplot(t_grob1) + theme(plot.margin = margin(0,3,0,0, "cm"))
plot_2 = as_ggplot(t_grob2) + theme(plot.margin = margin(0,3,0,0, "cm"))
g1 = ggarrange(plot_1,ggarrange(p1.original, p2.original,
ncol = 2), nrow = 2, heights = c(1,5))
g2 = ggarrange(plot_2, ggarrange(p1.cons, p2.cons,
ncol = 2), nrow = 2, heights = c(1,5))
show(g1)
show(g2)
}
}else{
dists.length = length(dists)
for(h in 1:dists.length){
if(m > 0){
for(c in (1:m)){
cons.hc = get(hc.func)(dist(scale(cons.data), method = dists[h]), method = methods[c])
cons.dend = convert_to_phylo(cons.hc)
score.cons = L_score(cons.dend, original.data[, -3])
original.hc = get(hc.func)(dist(scale(original.data[, -3]), method = dists[h]),
method = methods[c])
original.dend = convert_to_phylo(original.hc)
score.original = L_score(original.dend, original.data[, -3])
p1.original = fviz_dend(original.hc, k = 2)
factors = factor(cutree(original.hc, k = 2))
temp_data = original.data[, -3]
temp_data$labels = factors
p2.original = temp_data %>%
ggplot(aes(x = X1, y = X2, colour = labels)) +
geom_point() +
labs(x = "X1",
y = "X2",
colour = "Labels") +
scale_colour_brewer(palette = "Set1") +
theme_minimal()
p1.cons = fviz_dend(cons.hc, k = 2)
factors = factor(cutree(original.hc, k = 2))
temp_data = original.data[, -3]
temp_data$labels = factors
p2.cons = temp_data %>%
ggplot(aes(x = X1, y = X2, colour = labels)) +
geom_point() +
labs(x = "X1",
y = "X2",
colour = "Labels") +
scale_colour_brewer(palette = "Set1") +
theme_minimal()
t_grob1 = text_grob(paste0(hc.func,".", methods[c],
" original data with ",dists[h]," distance score: ",
round(score.original, 4)))
t_grob2 = text_grob(paste0(hc.func,".", methods[c],
" wrong data with ", dists[h], " distance score: ", round(score.cons, 4)))
plot_1 = as_ggplot(t_grob1) + theme(plot.margin = margin(0,3,0,0, "cm"))
plot_2 = as_ggplot(t_grob2) + theme(plot.margin = margin(0,3,0,0, "cm"))
g1 = ggarrange(plot_1,ggarrange(p1.original, p2.original,
ncol = 2), nrow = 2, heights = c(1,5))
g2 = ggarrange(plot_2, ggarrange(p1.cons, p2.cons,
ncol = 2), nrow = 2, heights = c(1,5))
show(g1)
show(g2)
}
}else{
cons.hc = get(hc.func)(dist(scale(cons.data), method = dists[h]))
cons.dend = convert_to_phylo(cons.hc)
score.cons = L_score(cons.dend, original.data[, -3])
original.hc = hclust(dist(scale(original.data[, -3]), method = dists[h]))
original.dend = convert_to_phylo(original.hc)
score.original = L_score(original.dend, original.data[, -3])
p1.original = fviz_dend(original.hc, k = 2)
factors = factor(cutree(original.hc, k = 2))
temp_data = original.data[, -3]
temp_data$labels = factors
p2.original = temp_data %>%
ggplot(aes(x = X1, y = X2, colour = labels)) +
geom_point() +
labs(x = "X1",
y = "X2",
colour = "Labels") +
scale_colour_brewer(palette = "Set1") +
theme_minimal()
p1.cons = fviz_dend(cons.hc, k = 2)
factors = factor(cutree(original.hc, k = 2))
temp_data = original.data[, -3]
temp_data$labels = factors
p2.cons = temp_data %>%
ggplot(aes(x = X1, y = X2, colour = labels)) +
geom_point() +
labs(x = "X1",
y = "X2",
colour = "Labels") +
scale_colour_brewer(palette = "Set1") +
theme_minimal()
t_grob1 = text_grob(paste0(hc.func, " original data score: ",
round(score.original, 4)))
t_grob2 = text_grob(paste0(hc.func, " wrong data score: ", round(score.cons, 4)))
plot_1 = as_ggplot(t_grob1) + theme(plot.margin = margin(0,3,0,0, "cm"))
plot_2 = as_ggplot(t_grob2) + theme(plot.margin = margin(0,3,0,0, "cm"))
g1 = ggarrange(plot_1,ggarrange(p1.original, p2.original,
ncol = 2), nrow = 2, heights = c(1,5))
g2 = ggarrange(plot_2, ggarrange(p1.cons, p2.cons,
ncol = 2), nrow = 2, heights = c(1,5))
show(g1)
show(g2)
}
}
}
}
}
And now a list of all clustering methods and distances of interest:
dists = c("euclidean", "manhattan", "canberra")
test.list = list(hclust = c("ward.D", "single","ward.D2", "average", "complete", "mcquitty", "median", "centroid"), agnes = c("weighted", "complete", "average", "ward"), diana = NULL)
plot_all_scores(sim.data, X_3, test.list, dists)